[1] "2.34.1"
modelling workflow for risk and safety
prepared for the Health & Safety Executive, Science Division
13 Feb 2025
BUGS/JAGS to StanStan, which tunes hyperparameters during warmup!BUGS/JAGS to StanHoffman, M.D., Gelman, A., ‘The No-U-Turn Sampler’, JMLR, 2014
BUGS/JAGS to StanStanCon anyone?
‘democratising scalable UQ’ …once installed 🤔
looking for evidence of active corrosion growth
3×5 DataFrame
Row │ anomaly_id soil_type inspection measured_depth_mm T
│ Int64 String1 String3 Float64 Int64
─────┼─────────────────────────────────────────────────────────────
1 │ 1 A i_0 0.0672091 0
2 │ 2 A i_0 0.104202 0
3 │ 3 A i_0 0.100588 0
data {
int<lower=0> n_anomalies; // number of anomalies
int<lower=0> n_inspections; // number of inspections
vector[n_anomalies * n_inspections] cgr; // growth rate observations (2 per anomaly)
}
parameters {
real<lower=0> mu; // mean growth rate
real<lower=0> sigma; // standard deviation
}
model {
// model
for (i in 1 : (n_anomalies * n_inspections)) {
cgr[i] ~ normal(mu, sigma);
}
/*
//alternative (vectorised) implementation:
cgr ~ normal(mu, sigma);
//some suggested priors
mu ~ normal(1/4, 3);
sigma ~ exponential(1);
*/
}
generated quantities {
/*
vector[n_anomalies * n_inspections] log_lik;
real cgr_pred = normal_rng(mu, sigma);
for (i in 1:(n_anomalies * n_inspections)) {
log_lik[i] = normal_lpdf(delta_C[i] | mu, sigma);
}
*/
}
INFO:cmdstanpy:found newer exe file, not recompiling
data {
int <lower = 0> n_anomalies; // number of anomalies
int <lower = 0> n_inspections; // number of inspections
vector[n_anomalies * n_inspections] cgr; // growth rate observations (2 per anomaly)
}
parameters {
real<lower = 0> mu; // mean growth rate
real<lower = 0> sigma; // standard deviation
}
model {
// model
for(i in 1:(n_anomalies * n_inspections)){
cgr[i] ~ normal(mu, sigma);
}
/*
//alternative (vectorised) implementation:
cgr ~ normal(mu, sigma);
//some suggested priors
mu ~ normal(1/4, 3);
sigma ~ exponential(1);
*/
}
generated quantities {
/*
vector[n_anomalies * n_inspections] log_lik;
real cgr_pred = normal_rng(mu, sigma);
for (i in 1:(n_anomalies * n_inspections)) {
log_lik[i] = normal_lpdf(delta_C[i] | mu, sigma);
}
*/
}
CmdStanR needs it’s input data as a list
prepare_data <- function(df = corrosion_data) {
df |>
arrange(anomaly_id, T) |>
group_by(anomaly_id) |>
mutate(
next_depth = lead(measured_depth_mm),
time_diff = lead(T) - T
) |>
filter(!is.na(next_depth)) |>
mutate(
delta_C = (next_depth - measured_depth_mm) / time_diff
) |>
select(anomaly_id, delta_C, soil_type) |>
ungroup()
}
model_data <- list(
n_anomalies = prepare_data()$anomaly_id |> unique() |> length(),
n_inspections = 2,
cgr = prepare_data()$delta_C
)
cgr_post <- cgr_model$sample(data = model_data)Running MCMC with 4 sequential chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 0.1 seconds.
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 0.1 seconds.
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 0.1 seconds.
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.7 seconds.
CmdStanPy needs it’s input data as a dictionary
def prepare_data(df = corrosion_data):
return (
df.sort(['anomaly_id', 'T'])
.group_by('anomaly_id')
.agg([
pl.col('measured_depth_mm').shift(-1).alias('next_depth'),
pl.col('T').shift(-1).alias('next_time'),
pl.col('measured_depth_mm'),
pl.col('T')
])
.filter(pl.col('next_depth').is_not_null())
.with_columns([
((pl.col('next_depth') - pl.col('measured_depth_mm')) /
(pl.col('next_time') - pl.col('T'))).alias('delta_C')
])
.select(['anomaly_id', 'delta_C'])
.explode('delta_C') # Add this line to unnest the lists
.filter(pl.col('delta_C').is_not_null()) # Optional: remove null values if any
)
model_data = {
'n_anomalies': prepare_data().select('anomaly_id').unique().height,
'n_inspections': 2,
'cgr': prepare_data().select('delta_C').to_series().to_numpy()
}
cgr_post = cgr_model.sample(data = model_data)
INFO:cmdstanpy:CmdStan start processing
chain 1 |[33m [0m| 00:00 Status
chain 2 |[33m [0m| 00:00 Status[A
chain 3 |[33m [0m| 00:00 Status[A[A
chain 4 |[33m [0m| 00:00 Status[A[A[A
chain 1 |[33m4 [0m| 00:00 Status
chain 4 |[33m4 [0m| 00:00 Status[A[A[A
chain 2 |[33m4 [0m| 00:00 Status[A
chain 3 |[33m4 [0m| 00:00 Status[A[A
chain 1 |[34m##########[0m| 00:00 Sampling completed
chain 1 |[34m##########[0m| 00:00 Sampling completed
chain 2 |[34m##########[0m| 00:00 Sampling completed
chain 3 |[34m##########[0m| 00:00 Sampling completed
chain 4 |[34m##########[0m| 00:00 Sampling completed
INFO:cmdstanpy:CmdStan done processing.
A Turing model needs it’s input data as arguments to the model function
function prepare_data(df::DataFrame = corrosion_data)
sorted_df = sort(df, [:anomaly_id, :T]); result = DataFrame()
for group in groupby(sorted_df, :anomaly_id)
if nrow(group) > 1
for i in 1:(nrow(group)-1)
Δc = (group[i+1, :measured_depth_mm] - group[i, :measured_depth_mm]) / (group[i+1, :T] - group[i, :T])
push!(result, (
anomaly_id = group[i, :anomaly_id], Δc = max(0, Δc)
))
end
end
end
return result
end
cgr_post = prepare_data().Δc |>
data -> corrosion_growth(data) |>
model -> sample(model, NUTS(), 1_000)# A draws_df: 1000 iterations, 4 chains, and 3 variables
lp__ mu sigma
1 130 0.079 0.043
2 130 0.082 0.038
3 130 0.083 0.042
4 129 0.073 0.043
5 126 0.098 0.049
6 131 0.081 0.040
7 130 0.081 0.042
8 130 0.082 0.039
9 131 0.082 0.040
10 130 0.080 0.045
# ... with 3990 more draws
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}
# A tibble: 8,000 × 4
Parameter Chain Iteration value
<chr> <int> <int> <dbl>
1 mu 1 1 0.0791
2 mu 1 2 0.0819
3 mu 1 3 0.0830
4 mu 1 4 0.0733
5 mu 1 5 0.0976
6 mu 1 6 0.0812
7 mu 1 7 0.0810
8 mu 1 8 0.0817
9 mu 1 9 0.0821
10 mu 1 10 0.0795
# ℹ 7,990 more rows
array([[[ 1.30191e+02, 8.70383e-01, 9.39165e-01, ..., -1.26941e+02,
8.52370e-02, 3.81699e-02],
[ 1.30448e+02, 9.98278e-01, 7.99502e-01, ..., -1.30004e+02,
7.99275e-02, 3.93765e-02],
[ 1.29385e+02, 9.17764e-01, 8.69625e-01, ..., -1.29304e+02,
7.86762e-02, 3.51568e-02],
[ 1.29298e+02, 9.68380e-01, 9.22988e-01, ..., -1.28776e+02,
8.26302e-02, 4.74505e-02]],
[[ 1.26976e+02, 3.14248e-01, 9.39165e-01, ..., -1.26959e+02,
9.13616e-02, 3.39815e-02],
[ 1.30210e+02, 9.42038e-01, 7.99502e-01, ..., -1.29967e+02,
7.87354e-02, 4.27150e-02],
[ 1.29867e+02, 1.00000e+00, 8.69625e-01, ..., -1.29443e+02,
7.89771e-02, 3.63841e-02],
[ 1.30041e+02, 1.00000e+00, 9.22988e-01, ..., -1.29066e+02,
7.81961e-02, 4.35307e-02]],
[[ 1.26912e+02, 1.00000e+00, 9.39165e-01, ..., -1.24507e+02,
7.17614e-02, 5.15462e-02],
[ 1.29926e+02, 9.28091e-01, 7.99502e-01, ..., -1.29188e+02,
8.80497e-02, 4.02009e-02],
[ 1.30283e+02, 9.30620e-01, 8.69625e-01, ..., -1.29006e+02,
8.18088e-02, 4.31645e-02],
[ 1.29819e+02, 9.77955e-01, 9.22988e-01, ..., -1.29577e+02,
8.15160e-02, 3.57827e-02]],
...,
[[ 1.28314e+02, 9.52373e-01, 9.39165e-01, ..., -1.27377e+02,
7.13041e-02, 4.61701e-02],
[ 1.25893e+02, 8.52665e-01, 7.99502e-01, ..., -1.25517e+02,
6.76826e-02, 3.57083e-02],
[ 1.28347e+02, 9.43010e-01, 8.69625e-01, ..., -1.25492e+02,
7.04713e-02, 4.47044e-02],
[ 1.29229e+02, 9.98613e-01, 9.22988e-01, ..., -1.28917e+02,
8.65338e-02, 3.54272e-02]],
[[ 1.29311e+02, 9.49483e-01, 9.39165e-01, ..., -1.27388e+02,
7.76483e-02, 3.53334e-02],
[ 1.28376e+02, 1.00000e+00, 7.99502e-01, ..., -1.26377e+02,
7.17200e-02, 3.71691e-02],
[ 1.29602e+02, 9.70368e-01, 8.69625e-01, ..., -1.27782e+02,
8.70311e-02, 3.67472e-02],
[ 1.29834e+02, 1.00000e+00, 9.22988e-01, ..., -1.29222e+02,
8.70133e-02, 3.76706e-02]],
[[ 1.30377e+02, 9.56418e-01, 9.39165e-01, ..., -1.28847e+02,
8.43941e-02, 3.90515e-02],
[ 1.30451e+02, 1.00000e+00, 7.99502e-01, ..., -1.28501e+02,
7.99870e-02, 4.10477e-02],
[ 1.29507e+02, 9.81135e-01, 8.69625e-01, ..., -1.29301e+02,
8.67452e-02, 3.62670e-02],
[ 1.30448e+02, 1.00000e+00, 9.22988e-01, ..., -1.29851e+02,
8.38798e-02, 4.08904e-02]]])
lp__ accept_stat__ stepsize__ ... energy__ mu sigma
0 130.191 0.870383 0.939165 ... -126.941 0.085237 0.038170
1 126.976 0.314248 0.939165 ... -126.959 0.091362 0.033981
2 126.912 1.000000 0.939165 ... -124.507 0.071761 0.051546
3 129.682 1.000000 0.939165 ... -126.943 0.074709 0.042224
4 129.441 0.966865 0.939165 ... -129.286 0.089871 0.039224
... ... ... ... ... ... ... ...
3995 130.308 0.996709 0.922988 ... -130.045 0.085223 0.039215
3996 129.321 0.851002 0.922988 ... -129.233 0.076897 0.046297
3997 129.229 0.998613 0.922988 ... -128.917 0.086534 0.035427
3998 129.834 1.000000 0.922988 ... -129.222 0.087013 0.037671
3999 130.448 1.000000 0.922988 ... -129.851 0.083880 0.040890
[4000 rows x 9 columns]
1000×16 DataFrame
Row │ iteration chain μ σ lp n_steps is_accept a ⋯
│ Int64 Int64 Float64 Float64 Float64 Float64 Float64 F ⋯
──────┼─────────────────────────────────────────────────────────────────────────
1 │ 501 1 0.063576 0.0523519 84.1164 3.0 1.0 ⋯
2 │ 502 1 0.0770471 0.0517853 84.9852 3.0 1.0
3 │ 503 1 0.0829532 0.0434559 85.9066 3.0 1.0
4 │ 504 1 0.0756827 0.0432456 85.9647 3.0 1.0
5 │ 505 1 0.0776469 0.0458524 85.8984 3.0 1.0 ⋯
6 │ 506 1 0.0837327 0.0447781 85.7045 7.0 1.0
7 │ 507 1 0.0548135 0.0606006 82.4548 7.0 1.0
8 │ 508 1 0.0821371 0.0427293 86.0145 7.0 1.0
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
994 │ 1494 1 0.0823426 0.0399723 85.9794 1.0 1.0 ⋯
995 │ 1495 1 0.0828584 0.0466502 85.5479 3.0 1.0
996 │ 1496 1 0.084823 0.035643 84.8893 7.0 1.0
997 │ 1497 1 0.0736374 0.0499017 85.3036 3.0 1.0
998 │ 1498 1 0.0787466 0.0416955 86.112 3.0 1.0 ⋯
999 │ 1499 1 0.0835207 0.0386875 85.7618 3.0 1.0
1000 │ 1500 1 0.0763782 0.0472499 85.7274 7.0 1.0
9 columns and 985 rows omitted
success
next up: how we can extend this to a robust and helpful workflow
☕
Turing.jln_draws = 1_000; n_chains = 4
# a no U-turn sampler, with 2000 adaptive steps and a target acceptance rate of 0.65
NUTS_sampler = NUTS(2_000, 0.65)
# a Hamiltonian Monte Carlo sampler, with a step size of 0.05 and 10 leapfrog steps
HMC_sampler = HMC(0.05, 10)
# a Metropolis-Hastings sampler, using the default proposal distribution (priors)
MH_sampler = MH()
# a 'compositional' Gibbs sampler (Metropolis within Gibbs) - sampling μ with MH and σ with NUTS
Gibbs_sampler = Gibbs(MH(:μ), NUTS(2_000, 0.65, :σ))
run_mcmc = function(sampler)
return prepare_data().Δc |>
data -> corrosion_growth(data) |>
model -> sample(model, sampler, MCMCThreads(), n_draws, n_chains)
end